import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


def generate_url(i):
    """Return the ``list{i}.htm`` URL under the site's ``/5796/`` section.

    ``i`` is interpolated with its default string formatting, so any value
    that can be formatted (typically an integer) is accepted.
    """
    template = 'https://news.nau.edu.cn/5796/list{}.htm'
    return template.format(i)


# Entry point of the crawl; ``current_page`` is advanced by the crawl loop as
# it follows each listing page's "next" link.
# (Plain string literal: there are no placeholders, so no f-prefix is needed.)
first_page = 'https://news.nau.edu.cn/5796/list.htm'
current_page = first_page

# req = requests.get(current_page)
# soup = BeautifulSoup(req.content, features='lxml')
# all_pages = int(soup.select('.all_pages')[0].text.strip())
# for i in range(1, all_pages+1):
#     print(i)

# Parallel column accumulators: index k of every list describes the same
# scraped article. Each name is bound to its own, independent list.
titles, dates, urls, contents = [], [], [], []


# Seconds to wait for any single HTTP response before giving up, so that a
# stalled connection cannot hang the whole crawl indefinitely.
REQUEST_TIMEOUT = 30

# Walk the listing pages one after another, starting at ``current_page``.
# For every entry on a page, fetch the linked article and record its title,
# date, URL and text in the four parallel lists. The loop ends when the
# listing page offers no further "next" link.
while True:
    page_resp = requests.get(current_page, timeout=REQUEST_TIMEOUT)
    # A failed listing page has no entries and no pagination link; fail loudly
    # instead of silently ending the crawl with partial data.
    page_resp.raise_for_status()
    soup = BeautifulSoup(page_resp.content, features='lxml')

    for li in soup.select('.cols_list > .cols'):
        link = li.select_one('.cols_title a')
        if link is None or not link.get('href'):
            # Entry without a usable link: nothing to fetch.
            continue
        href = link['href'].strip()
        if href.startswith('https://mp.weixin.qq.com/'):
            # Articles hosted on WeChat are skipped.
            continue

        title = link.text.strip()
        meta = li.select_one('.cols_meta')
        date = meta.text.strip() if meta is not None else ''
        # urljoin resolves site-relative hrefs against the current page and
        # leaves already-absolute hrefs intact (plain concatenation with the
        # host would corrupt them).
        url = urljoin(current_page, href)
        print(title, date)

        article_resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        article_soup = BeautifulSoup(article_resp.content, features='lxml')
        body = article_soup.select_one('.wp_articlecontent')
        # Pages without the expected content container (e.g. external or
        # removed articles) yield an empty body rather than aborting the run.
        content = body.text if body is not None else ''
        print(content)

        # Append all four columns together so the lists always stay aligned.
        titles.append(title)
        dates.append(date)
        urls.append(url)
        contents.append(content)

    next_link = soup.select_one('.next')
    next_href = ''
    if next_link is not None:
        next_href = (next_link.get('href') or '').strip()
    # On the last page the "next" control is a no-op javascript: link
    # ('javascript:void(0);'); a missing or empty link also ends the crawl.
    if not next_href or next_href.startswith('javascript:'):
        break
    current_page = urljoin(current_page, next_href)
    # Uncomment to throttle requests between listing pages:
    # time.sleep(2)

# Combine the four parallel lists into one table (one row per article, columns
# in the order title, date, url, content) and write it to an Excel workbook
# without the row-index column.
column_names = ('title', 'date', 'url', 'content')
column_values = (titles, dates, urls, contents)
data = pd.DataFrame(dict(zip(column_names, column_values)))
data.to_excel('yaowen.xlsx', index=False)
